##This R file reproduces GSS-based results reported in "Social Mobility as Causal Intervention", Lai Wei and Yu Xie

library(boot)
library(tidyverse)
library(haven)
library(CBPS)
library(survey)
library(naniar)
library(gridExtra)
library(butils)
library(MatchIt)
library(randomForest)
library(SuperLearner)
library(sjPlot)
library(sl3)
library(gbm)
library(e1071)
library(entropy)
library(xtable)
library(mice)
library(cowplot)
library(mipfp)


setwd("Your Working Directory")

load("gss_fertility.rDa")

controls <- c("age","sibs","ethnic",
              "res16","reg16","family16","relig16","fund16","incom16",
              "pawrkslf","parborn","padeg","madeg")


#######Outcome Model
#######First, we train a random forest to predict expected fertility by educational level and background covairates
#######This model provides causal effect of education on fertility 


set.seed(08540)

fertility_task <- make_sl3_Task(
  data = gss_fertility,
  covariates = c("degree",controls),
  outcome = "childs",
  outcome_type = "continuous")

sl3_list_learners("continuous") 

stack <- make_learner_stack(
  "Lrnr_ranger","Lrnr_glmnet"
)



sl3_list_learners("continuous") 

meta <- make_learner("Lrnr_glm")

sl <- make_learner(Lrnr_sl, 
                   learners = stack,
                   metalearner = meta)

fit_start <- Sys.time()

sl_fit <- sl$train(fertility_task)

fit_end <- Sys.time()

fit_duration <- fit_end - fit_start

fit_duration



#######Get predicted outcomes after receiving specific treatments


gss_fertility0 <- gss_fertility %>% dplyr::select(-degree) %>% mutate(degree = as.factor(rep(0,nrow(gss_fertility))),.keep = "all")
gss_fertility1 <- gss_fertility %>% dplyr::select(-degree) %>% mutate(degree = as.factor(rep(1,nrow(gss_fertility))),.keep = "all")
gss_fertility2 <- gss_fertility %>% dplyr::select(-degree) %>% mutate(degree = as.factor(rep(2,nrow(gss_fertility))),.keep = "all")
gss_fertility0$degree <- factor(gss_fertility0$degree, levels = c(0,1,2))
gss_fertility1$degree <- factor(gss_fertility1$degree, levels = c(0,1,2))
gss_fertility2$degree <- factor(gss_fertility2$degree, levels = c(0,1,2))

gss_fertility$pred <- sl_fit$predict(make_sl3_Task(
  data = gss_fertility,
  covariates = c(controls,"degree"),
  outcome = "childs",
  outcome_type = "continuous"))

gss_fertility$pred0 <- sl_fit$predict(make_sl3_Task(
  data = gss_fertility0,
  covariates = c(controls,"degree"),
  outcome = "childs",
  outcome_type = "continuous"))

gss_fertility$pred1 <- sl_fit$predict(make_sl3_Task(
  data = gss_fertility1,
  covariates = c(controls,"degree"),
  outcome = "childs",
  outcome_type = "continuous"))

gss_fertility$pred2 <- sl_fit$predict(make_sl3_Task(
  data = gss_fertility2,
  covariates = c(controls,"degree"),
  outcome = "childs",
  outcome_type = "continuous"))


#######Treatment Model

####Functional Setups

##two tables one intervention one observed,origin as rows and destination as columns; reference as first column
##In the intervention where observed treatment is considered, obs is (OD)DPSI table with DDPSI independence, intervention is the ODDPSI table output by function DDPSI


intervention.delta <- function(intervention,observed){
  #adding a small value to avoid 0  
  intervention <- intervention + 0.1
  observed <- observed + 0.1
  
  mat.int <- matrix(nrow=nrow(intervention),ncol=ncol(intervention))
  for (i in 1:nrow(intervention)) {
    for (j in 1:ncol(intervention)) {
      mat.int[i,j] <- intervention[i,j]/intervention[i,1]
    }
  }
  
  mat.obs <- matrix(nrow=nrow(observed),ncol=ncol(observed))
  for (i in 1:nrow(observed)) {
    for (j in 1:ncol(observed)) {
      mat.obs[i,j] <- observed[i,j]/observed[i,1]
    }
  }
  
  return(mat.int/mat.obs)
}



##Post-Intervention Propensity Score: origin is origin column, pscores are data columns of pscores, delta is the delta matrix as calculated by intervention.delta
pips <- function(origin,pscores,delta){
  
  temp <- matrix(nrow=nrow(pscores),ncol=ncol(pscores))
  
  for (i in 1:length(unique(origin))) {
    temp[origin==i-1,] <- t(t(pscores[origin==i-1,]) * delta[i,]) / as.vector((pscores[origin==i-1,] %*% matrix(delta[i,])))
  }
  
  return(temp)
}


#Function: Step-Wise OR (Code work for 3-category OD only, ten steps, 0 means total independence, 10 means observed data)
stepor <- function(table){
  
  d <- as.data.frame(table)
  # Fit the model
  mod <- glm( Freq ~ Var1 + Var2 + Var1*Var2, data=d, family=poisson("log") )
  
  for (i in 0:10) {
    d[,paste0("asso",i)] <- 0
    d[c(5:6,8:9),paste0("asso",i)] <- 0.1*i* coef(mod)[6:9] 
    d[,paste0("pred",i)] <- predict(glm(as.formula(paste0("Freq ~ Var1 + Var2 + offset(",paste0("asso",i),")")), data=d, family=poisson("log") ), type="response")  
  }
  
  return(d)
}

stepdat <- stepor(table(gss_fertility$padeg,gss_fertility$degree))

#Total Equalization

obs.dat <- as.data.frame(table(gss_fertility$padeg,gss_fertility$degree))

obs.dat$ind <- predict(glm(Freq ~ Var1 + Var2, data = obs.dat, family = poisson("log")),type="response")





obs <- xtabs(Freq ~ Var1 + Var2, obs.dat)
ind <- xtabs(ind ~ Var1 + Var2, obs.dat)


for (i in 0:10) {
  assign(paste0("step.",i) , xtabs(as.formula(paste0(paste0("pred",i),"~ Var1 + Var2")),stepdat) )
}



#Deltas: interventional effect on probability of receiving treatment
intervention.delta(ind,obs)


#######PIPS (Post-Intervention Propensity Score) Estimation

pscore <- multinom(degree ~ ., gss_fertility %>% dplyr::select(degree,controls))

gss_fertility[,c("prob0","prob1","prob2")] <- predict(pscore,newdata = gss_fertility %>% dplyr::select(degree,controls),"probs")

gss_fertility[,c("pips.ind0","pips.ind1","pips.ind2")] <- pips(origin=gss_fertility$padeg,pscores = as.matrix(gss_fertility[,c("prob0","prob1","prob2")]),delta = intervention.delta(ind,obs))

for (i in 0:10) {
  gss_fertility[,c(paste0("pips.step",i,".0"),paste0("pips.step",i,".1"),paste0("pips.step",i,".2"))] <- pips(origin=gss_fertility$padeg,pscores = as.matrix(gss_fertility[,c("prob0","prob1","prob2")]),delta = intervention.delta(eval(as.name(paste0("step.",i))),obs))
}

#######PIERO (Post-Intervention Expected Realized Outcome) Estimation
gss_fertility[,"piero.ind"] <- apply(gss_fertility[,c("pips.ind0","pips.ind1","pips.ind2")] *
                                       gss_fertility[,c("pred0","pred1","pred2")],1,sum)

for (i in 0:10) {
  gss_fertility[,paste0("piero.step",i)] <- apply(gss_fertility[,c(paste0("pips.step",i,".0"),paste0("pips.step",i,".1"),paste0("pips.step",i,".2"))] *
                                                    gss_fertility[,c("pred0","pred1","pred2")],1,sum)
}

#Check normalized
#######MIE Estimation: Little even for total independence, no need for other types of interventions
gss_fertility$probt <- ifelse(gss_fertility$degree==0,gss_fertility$prob0,
                              ifelse(gss_fertility$degree==1,gss_fertility$prob1,gss_fertility$prob2))
gss_fertility$pips.indt <- ifelse(gss_fertility$degree==0,gss_fertility$pips.ind0,
                                  ifelse(gss_fertility$degree==1,gss_fertility$pips.ind1,gss_fertility$pips.ind2))
gss_fertility$probchange <- gss_fertility$prob0*(gss_fertility$pips.ind1+gss_fertility$pips.ind2)+gss_fertility$prob1*(gss_fertility$pips.ind0+gss_fertility$pips.ind2)+gss_fertility$prob2*(gss_fertility$pips.ind0+gss_fertility$pips.ind1)

mean(abs(gss_fertility$probchange))

gss_fertility$bene <- ifelse(gss_fertility$padeg==0,1,0)

#Non-Normalized MIE
mie <- mean(gss_fertility$piero.ind) - mean(gss_fertility$childs)
mieb <- (mean(gss_fertility[gss_fertility$bene==1,"piero.ind"]) - mean(gss_fertility[gss_fertility$bene==1,"childs"]))
mieg <- (mean(gss_fertility[gss_fertility$bene==0,"piero.ind"]) - mean(gss_fertility[gss_fertility$bene==0,"childs"]))

stepeffs <- c()
for (i in 0:10) {
  stepeffs[i+1] <- mean(gss_fertility[,paste0("piero.step",i)]) - mean(gss_fertility$childs)
}


#Normalized MIE: MIE/P(D^\psi!=D)    nweight=P(D^\psi!=D)
nweight <- mean(gss_fertility$probchange)
nmie <- mie/nweight

#Normalized MIE: beneficiary
nmieb <- mieb/mean(gss_fertility[gss_fertility$bene==1,"probchange"])

#Normalized MIE: giver
nmieg <- mieg/mean(gss_fertility[gss_fertility$bene==0,"probchange"])



#######GCE Estimation
gss_fertility %>% group_by(padeg) %>% summarise(mean = mean(childs))
gss_fertility %>% group_by(padeg) %>% summarise(mean = mean(piero.ind))

gss_fertility %>% group_by(degree) %>% summarise(mean = mean(childs))
gss_fertility %>% group_by(degree) %>% summarise(mean = mean(piero.ind))



#########Bootstrapping SE  

bootstat <- function(dat,i){
  
  data <- dat[i,]
  ####Setting table
  
  obs.dat <- as.data.frame(table(data$padeg,data$degree))
  
  obs.dat$ind <- predict(glm(Freq ~ Var1 + Var2, data = obs.dat, family = poisson("log")),type="response")
  
  ####PIPO
  


  
  fertility_task <- make_sl3_Task(
    data = data,
    covariates = c("degree",controls),
    outcome = "childs",
    outcome_type = "continuous")
  
  sl3_list_learners("continuous") 
  
  stack <- make_learner_stack(
    "Lrnr_ranger","Lrnr_glmnet"
  )
  
  
  sl3_list_learners("continuous") 
  
  meta <- make_learner("Lrnr_glm")
  
  sl <- make_learner(Lrnr_sl, 
                     learners = stack,
                     metalearner = meta)
  
  sl_fit <- sl$train(fertility_task)
  


  #Get PIPOs
  gss_fertility0 <- data %>% dplyr::select(-degree) %>% mutate(degree = as.factor(rep(0,nrow(data))),.keep = "all")
  gss_fertility1 <- data %>% dplyr::select(-degree) %>% mutate(degree = as.factor(rep(1,nrow(data))),.keep = "all")
  gss_fertility2 <- data %>% dplyr::select(-degree) %>% mutate(degree = as.factor(rep(2,nrow(data))),.keep = "all")
  gss_fertility0$degree <- factor(gss_fertility0$degree, levels = c(0,1,2))
  gss_fertility1$degree <- factor(gss_fertility1$degree, levels = c(0,1,2))
  gss_fertility2$degree <- factor(gss_fertility2$degree, levels = c(0,1,2))

  data$pred <- sl_fit$predict(make_sl3_Task(
    data = data,
    covariates = c(controls,"degree"),
    outcome = "childs",
    outcome_type = "continuous"))

  data$pred0 <- sl_fit$predict(make_sl3_Task(
    data = gss_fertility0,
    covariates = c(controls,"degree"),
    outcome = "childs",
    outcome_type = "continuous"))

  data$pred1 <- sl_fit$predict(make_sl3_Task(
    data = gss_fertility1,
    covariates = c(controls,"degree"),
    outcome = "childs",
    outcome_type = "continuous"))

  data$pred2 <- sl_fit$predict(make_sl3_Task(
    data = gss_fertility2,
    covariates = c(controls,"degree"),
    outcome = "childs",
    outcome_type = "continuous"))

  
  #######PIPS Estimation
  data <- data %>% drop_na()
  
  pscore <- multinom(degree ~ ., data %>% dplyr::select(degree,controls))
  
  data[,c("prob0","prob1","prob2")] <- predict(pscore,newdata = data %>% dplyr::select(degree,controls),"probs")
  
  data[,c("pips.ind0","pips.ind1","pips.ind2")] <- pips(origin=data$padeg,pscores = as.matrix(data[,c("prob0","prob1","prob2")]),delta = intervention.delta(ind,obs))
  
  #######Steps
  stepdat <- stepor(table(data$padeg,data$degree))
  for (i in 0:10) {
    assign(paste0("step.",i) , xtabs(as.formula(paste0(paste0("pred",i),"~ Var1 + Var2")),stepdat) )
  }
  for (i in 0:10) {
    data[,c(paste0("pips.step",i,".0"),paste0("pips.step",i,".1"),paste0("pips.step",i,".2"))] <- pips(origin=data$padeg,pscores = as.matrix(data[,c("prob0","prob1","prob2")]),delta = intervention.delta(eval(as.name(paste0("step.",i))),obs))
  }
  for (i in 0:10) {
    data[,paste0("piero.step",i)] <- apply(gss_fertility[,c(paste0("pips.step",i,".0"),paste0("pips.step",i,".1"),paste0("pips.step",i,".2"))] *
                                             gss_fertility[,c("pred0","pred1","pred2")],1,sum)
  }
  
  
  #######PIERO Estimation
  data[,"piero.ind"] <- apply(data[,c("pips.ind0","pips.ind1","pips.ind2")] *
                                data[,c("pred0","pred1","pred2")],1,sum)
  
  
  
  #Check normalized
  #######MIE Estimation: Little even for total independence, no need for other types of interventions
  data$probt <- ifelse(data$degree==0,data$prob0,
                       ifelse(data$degree==1,data$prob1,data$prob2))
  data$pips.indt <- ifelse(data$degree==0,data$pips.ind0,
                           ifelse(data$degree==1,data$pips.ind1,data$pips.ind2))
  data$probchange <- data$prob0*(data$pips.ind1+data$pips.ind2)+data$prob1*(data$pips.ind0+data$pips.ind2)+data$prob2*(data$pips.ind0+data$pips.ind1)
  
  return( 
    c(
      #Non-Normalized MIE
      mean(data$piero.ind) - mean(data$childs),
      
      #Normalized MIE
      (mean(data$piero.ind) - mean(data$childs))/mean(data$probchange),
      
      #Normalized MIE: beneficiary
      (mean(data[data$padeg==0,"piero.ind"]) - mean(data[data$padeg==0,"childs"]))/
        mean(data[data$padeg==0,"probchange"]),
      
      #Normalized MIE: giver
      (mean(data[data$padeg!=0,"piero.ind"]) - mean(data[data$padeg!=0,"childs"]))/
        mean(data[data$padeg!=0,"probchange"]),
      
      #Steps
      mean(data[,paste0("piero.step",0)]) - mean(data$childs),
      mean(data[,paste0("piero.step",1)]) - mean(data$childs),
      mean(data[,paste0("piero.step",2)]) - mean(data$childs),
      mean(data[,paste0("piero.step",3)]) - mean(data$childs),
      mean(data[,paste0("piero.step",4)]) - mean(data$childs),
      mean(data[,paste0("piero.step",5)]) - mean(data$childs),
      mean(data[,paste0("piero.step",6)]) - mean(data$childs),
      mean(data[,paste0("piero.step",7)]) - mean(data$childs),
      mean(data[,paste0("piero.step",8)]) - mean(data$childs),
      mean(data[,paste0("piero.step",9)]) - mean(data$childs),
      mean(data[,paste0("piero.step",10)]) - mean(data$childs),
      
      
      #######GCE Estimation
      as.data.frame(data %>% group_by(padeg) %>% summarise(mean = mean(childs)))[,2],
      as.data.frame(data %>% group_by(padeg) %>% summarise(mean = mean(piero.ind)))[,2],
      
      as.data.frame(data %>% group_by(degree) %>% summarise(mean = mean(childs)))[,2],
      as.data.frame(data %>% group_by(degree) %>% summarise(mean = mean(piero.ind)))[,2],
      
      mean(data[data$padeg==0,"pred0"]),
      mean(data[data$padeg==1,"pred0"]),
      mean(data[data$padeg==2,"pred0"]),
      mean(data[data$padeg==0,"pred1"]),
      mean(data[data$padeg==1,"pred1"]),
      mean(data[data$padeg==2,"pred1"]),
      mean(data[data$padeg==0,"pred2"]),
      mean(data[data$padeg==1,"pred2"]),
      mean(data[data$padeg==2,"pred2"])
      
      
      
      
    )
    
  )
}


boot_result <- butils::bootPb(data=gss_fertility,statistic=bootstat,R=500)

boot_sd <- apply(boot_result[["t"]],2,sd)


vis1 <- as.data.frame(cbind(c("Total","Gainer","Giver"),
                            c(mie,mieb,mieg),
                            c(boot_sd[2:4]* c(mean(abs(gss_fertility[gss_fertility$padeg==0,"probchange"])),
                                              mean(abs(gss_fertility[gss_fertility$padeg==0,"probchange"])), 
                                              mean(abs(gss_fertility[gss_fertility$padeg!=0,"probchange"])))
                            )))




colnames(vis1) <- c("name","mean","se")



vis2 <- gss_fertility %>% group_by(padeg) %>% summarise(obs= mean(childs),
                                                        ind=mean(piero.ind))
vis2 <- as.data.frame(cbind(c(0,1,2,0,1,2),
                            c(1,1,1,2,2,2),
                            c(vis2$obs,vis2$ind),
                            boot_sd[5:10]))

colnames(vis2) <- c("padeg","intervention","mean","se")

vis3 <- gss_fertility %>% group_by(degree) %>% summarise(obs= mean(childs),
                                                         ind=mean(piero.ind))
vis3 <- as.data.frame(cbind(c(0,1,2,0,1,2),
                            c(1,1,1,2,2,2),
                            c(vis3$obs,vis3$ind),
                            boot_sd[11:16]))

colnames(vis3) <- c("padeg","intervention","mean","se")


vis4 <- as.data.frame(cbind(c(0,0,0,1,1,1),
                            c(0,1,2,0,1,2),
                            c(mean(gss_fertility[gss_fertility$padeg==0,"pred1"])-mean(gss_fertility[gss_fertility$padeg==0,"pred0"]),
                              mean(gss_fertility[gss_fertility$padeg==1,"pred1"])-mean(gss_fertility[gss_fertility$padeg==1,"pred0"]),
                              mean(gss_fertility[gss_fertility$padeg==2,"pred1"])-mean(gss_fertility[gss_fertility$padeg==2,"pred0"]),
                              mean(gss_fertility[gss_fertility$padeg==0,"pred2"])-mean(gss_fertility[gss_fertility$padeg==0,"pred1"]),
                              mean(gss_fertility[gss_fertility$padeg==1,"pred2"])-mean(gss_fertility[gss_fertility$padeg==1,"pred1"]),
                              mean(gss_fertility[gss_fertility$padeg==2,"pred2"])-mean(gss_fertility[gss_fertility$padeg==2,"pred1"])),
                            c(sd(boot_result[["t"]][,20]-boot_result[["t"]][,17]),
                              sd(boot_result[["t"]][,21]-boot_result[["t"]][,18]),
                              sd(boot_result[["t"]][,22]-boot_result[["t"]][,19]),  
                              sd(boot_result[["t"]][,23]-boot_result[["t"]][,20]),
                              sd(boot_result[["t"]][,24]-boot_result[["t"]][,21]),
                              sd(boot_result[["t"]][,25]-boot_result[["t"]][,12]))
))

colnames(vis4) <- c("degree","padeg","mean","se")



#The following reproduces Figure 2

ggplot(data = vis4,
       aes(x=as.factor(padeg),y=as.numeric(mean),shape=as.factor(degree)))+
  geom_point(size=5,position=position_dodge(width=0.4))+
  geom_errorbar(width=0.2,position=position_dodge(width=0.4),aes(ymin=as.numeric(mean)-1.96*as.numeric(se),ymax=as.numeric(mean)+1.96*as.numeric(se)))+
  scale_shape_discrete(breaks=c(0,1),label=c("HS vs Lower than HS","College vs HS"))+
  scale_x_discrete(breaks=c(0,1,2),labels=c("Less than HS","HS","College"))+
  labs(x="Father Education",y="Education Effect on Fertility",shape="Education Effect")+
  theme_bw()+
  theme(text = element_text(size=20))

#The following reproduces Figure 3

ggplot(data = vis1,
       aes(x=name,y=as.numeric(mean)))+
  geom_point()+
  geom_errorbar(width=0.2,aes(ymin=as.numeric(mean)-1.96*as.numeric(se),ymax=as.numeric(mean)+1.96*as.numeric(se)))+
  scale_color_brewer(palette = "Dark2",breaks=c(1,2,3),label=c("Lowe than HS","High School","College"))+
  labs(x="Influenced Population",y="Social Mobility Effect of Total Independence")+
  theme_bw()+
  scale_x_discrete(labels=c("Gainer","Giver","Total"))+
  theme(text = element_text(size=20))

#The following reproduces Figure 4


grid.arrange(
  
  ggplot(data = vis2,
         aes(x=factor(intervention),y=as.numeric(mean),shape=as.factor(padeg)))+
    geom_point(size=5)+
    geom_errorbar(width=0.2,aes(ymin=as.numeric(mean)-1.96*as.numeric(se),ymax=as.numeric(mean)+1.96*as.numeric(se)))+
    scale_shape_discrete(breaks=c(0,1,2),label=c("Lowe than HS","High School","College"))+
    scale_x_discrete(breaks=c(1,2),labels=c("Observed","Independence"))+
    scale_y_continuous(limit = c(1.8,2.9))+
    labs(x="Mobility Scheme",y="Fertility",shape="Father Education")+
    theme_bw()+
    theme(text = element_text(size=20))
  ,
  ggplot(data = vis3,
         aes(x=factor(intervention),y=as.numeric(mean),shape=as.factor(padeg)))+
    geom_point(size=5)+
    geom_errorbar(width=0.2,aes(ymin=as.numeric(mean)-1.96*as.numeric(se),ymax=as.numeric(mean)+1.96*as.numeric(se)))+
    scale_shape_discrete(breaks=c(0,1,2),label=c("Lowe than HS","High School","College"))+
    scale_x_discrete(breaks=c(1,2),labels=c("Observed","Independence"))+
    scale_y_continuous(limit = c(1.8,2.9))+
    labs(x="Mobility Scheme",y="Fertility",shape="Education")+
    theme_bw()+
    theme(text = element_text(size=20))
  ,
  nrow=1
  
)


save.image("gss_replication.RData")
